Define heatmap function
create_gene_heatmap <- function(de_results_df, dge_object, font_size = 12, num_genes = 20) {
# Subset the genes
top_genes <- de_results_df$Symbol[1:num_genes]
subset_matrix <- dge_object$logCPM[top_genes, colnames(dge_object$counts)]
# Create annotation dataframe
sample_info <- dge_object$samples
annotation_df <- data.frame(
Donor = sample_info$Donor,
Subset = sample_info$Subset,
Timepoint = sample_info$Timepoint
)
rownames(annotation_df) <- colnames(subset_matrix)
# Reorder gene expression matrix
ordered_samples <- order(sample_info$Timepoint, sample_info$Donor)
subset_matrix <- subset_matrix[, ordered_samples]
sample_info <- sample_info[ordered_samples, ]
# Create the heatmap
heatmap <- pheatmap(subset_matrix,
scale = "row",
cluster_rows = FALSE,
cluster_cols = FALSE,
color = viridis(50),
show_colnames = FALSE,
annotation_col = annotation_df,
fontsize = font_size,
silent = FALSE)
}The aim of this notebook is to identify genes that differ between CD4 and CD8 subsets over time. This will be a basic DE analysis informed by the time course centric analysis conducted in notebook 3A splines timecourse.
I sequenced these samples with Prime-seq on extracted RNAs already. Results were not that great. Here reprocess the same samples with TIRE-seq for a head to head comparison.
The processing went as planned. Full writeup available at ELN link
This was generated in 2A_clustering
sce <- readRDS(here::here(
"data/TIRE_Tcell/SCEs/tcell_cluster.sce.rds"
))
# Reorder factor levels
sce$Timepoint <- factor(sce$Timepoint,
levels = c("Day_0", "Day_1", "Day_2",
"Day_5", "Day_6", "Day_8",
"Day_9", "Day_13", "Day_15"))
keep_samples <- c("Day_0", "Day_2","Day_15")
dge <- scran::convertTo(sce[,sce$Timepoint %in% keep_samples], type="edgeR")
tb <- as_tibble(dge$samples)
dge$samples$Timepoint <- droplevels(dge$samples$Timepoint)Check on the perturbations conducted in this experiment:
Filter for genes that have at least 5 counts.
Currently keep 8,249
## Mode FALSE TRUE
## logical 14005 6910
Correct for composition biases by computing normalization factors with the trimmed mean of M-values method.
Combine time and subset into a new factor. For an interaction analysis it is easier to be more explicit than having fancy multiplication terms i.e timepoint * subset
The intercept term in a design matrix represents the baseline or reference level in your experimental design. Specifically:
Decide on the contrasts.
Based on the timecourse centric analysis in notebook 3A_time_spline focus on the 2 timepoints with the mpst dynamic changes
contr.matrix <- makeContrasts(
Day0_vs_Day2 = TimepointDay_2 - TimepointDay_0,
Day2_vs_Day15 = TimepointDay_15 - TimepointDay_2,
levels = colnames(sm))
contr.matrix %>%
kable()| Day0_vs_Day2 | Day2_vs_Day15 | |
|---|---|---|
| TimepointDay_0 | -1 | 0 |
| TimepointDay_2 | 1 | -1 |
| TimepointDay_15 | 0 | 1 |
| DonorDonor_64 | 0 | 0 |
In each contrast, the format is A - B where:
So in this analysis the positive log fold changes will be the later of the 2 timepoints being compared.
Fit this model using limma/ voom.
par(mfrow=c(1,2))
v <- voom(dge, sm, plot=TRUE)
vfit <- lmFit(v, sm)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")CD8 is highest which is a good sanity check.
day2_vs_day0 <- topTable(efit, coef=1, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(day2_vs_day0)
results$ID <- rownames(day2_vs_day0)
# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "Day2"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "Day0"Add gene labels
results$genelabels <- ""
results$genelabels[results$Symbol == "FGL2"] <- "FGL2"
results$genelabels[results$Symbol == "LINC00861"] <- "LINC00861"
results$genelabels[results$Symbol == "KLF2"] <- "KLF2"
results$genelabels[results$Symbol == "SDK2"] <- "SDK2"
results$genelabels[results$Symbol == "CDK1"] <- "CDK1"
results$genelabels[results$Symbol == "MKI67"] <- "MKI67"
results$genelabels[results$Symbol == "TOP2A"] <- "TOP2A"
results$genelabels[results$Symbol == "CDC45"] <- "CDC45"All markers of proliferation in day 2
results$DElabel <- factor(results$DElabel, levels = c("Day2", "n/s", "Day0")) # Adjust as per your actual labels
plt1 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) +
geom_point(alpha = 0.33, size = 1.5) +
geom_text_repel(size = 4, colour = "black") +
guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
scale_color_manual(
values = c("darkorange", "grey", "darkblue"), # Colors in the desired order
name = "Upregulated",
labels = c("Day2", "n/s", "Day0") # Optional: Add custom labels
) +
geom_vline(xintercept = 1, linetype = "dotted") +
geom_vline(xintercept = -1, linetype = "dotted") +
theme_Publication()
plt1dge$logCPM <- cpm(dge, log=TRUE, prior.count=1)
dge_subset <- dge[,dge$samples$Timepoint %in% c("Day_0", "Day_2")]
create_gene_heatmap(de_results_df = results, dge_object = dge_subset, font_size = 12, num_genes=15)Need to convert geneIDs from ensembl to enterez
geneids <- as.data.frame(v$genes$ID)
colnames(geneids) <- "ENSEMBL"
geneids$entrez <- mapIds(org.Hs.eg.db, keys = geneids$ENSEMBL, keytype = "ENSEMBL", column = "ENTREZID")
load("data/MSigDB/human_H_v5p2.rdata")
idx <- ids2indices(Hs.H,identifiers = geneids$entrez)Geneset testing
Visualize as a barplot.
Nothing that interesting proliferation a very strong signal
There are a set of highly significant genes switched off between day 15 and 2
day15_v_2 <- topTable(efit, coef=2, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(day15_v_2)
results$ID <- rownames(day15_v_2)
# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "Day15"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "Day2"Add gene labels
results$genelabels <- ""
results$genelabels[results$Symbol == "KRT1"] <- "KRT1"
results$genelabels[results$Symbol == "AL158071.3"] <- "AL158071.3"
results$genelabels[results$Symbol == "CCR2"] <- "CCR2"
results$genelabels[results$Symbol == "MMP25"] <- "MMP25"
results$genelabels[results$Symbol == "CD52"] <- "CD52"
results$genelabels[results$Symbol == "IRF8"] <- "IRF8"
results$genelabels[results$Symbol == "RGS16"] <- "RGS16"
results$genelabels[results$Symbol == "PMAIP1"] <- "PMAIP1"
results$genelabels[results$Symbol == "IFIT3"] <- "IFIT3"
results$genelabels[results$Symbol == "HSP90AA1"] <- "HSP90AA1"results$DElabel <- factor(results$DElabel, levels = c("Day15", "n/s", "Day2")) # Adjust as per your actual labels
plt2 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) +
geom_point(alpha = 0.33, size = 1.5) +
geom_text_repel(size = 4, colour = "black") +
guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
scale_color_manual(
values = c("darkorange", "grey", "darkblue"), # Colors in the desired order
name = "Upregulated",
labels = c("Day15", "n/s", "Day2") # Optional: Add custom labels
) +
geom_vline(xintercept = 1, linetype = "dotted") +
geom_vline(xintercept = -1, linetype = "dotted") +
theme_Publication()
plt2dge$logCPM <- cpm(dge, log=TRUE, prior.count=1)
dge_subset <- dge[,dge$samples$Timepoint %in% c("Day_15", "Day_2")]
create_gene_heatmap(de_results_df = results, dge_object = dge_subset, font_size = 12, num_genes=15)Nothing that interesting proliferation a very strong signal
Visualize as a barplot
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Red Hat Enterprise Linux 9.5 (Plow)
##
## Matrix products: default
## BLAS: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRblas.so
## LAPACK: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRlapack.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid splines stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] ggthemes_5.1.0 here_1.0.1
## [3] knitr_1.48 patchwork_1.3.0
## [5] org.Hs.eg.db_3.19.1 AnnotationDbi_1.66.0
## [7] ggrepel_0.9.6 viridis_0.6.5
## [9] viridisLite_0.4.2 pheatmap_1.0.12
## [11] edgeR_4.2.2 limma_3.60.6
## [13] scater_1.32.1 scuttle_1.14.0
## [15] lubridate_1.9.3 forcats_1.0.0
## [17] stringr_1.5.1 dplyr_1.1.4
## [19] purrr_1.0.2 readr_2.1.5
## [21] tidyr_1.3.1 tibble_3.2.1
## [23] ggplot2_3.5.1 tidyverse_2.0.0
## [25] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [27] Biobase_2.64.0 GenomicRanges_1.56.2
## [29] GenomeInfoDb_1.40.1 IRanges_2.38.1
## [31] S4Vectors_0.42.1 BiocGenerics_0.50.0
## [33] MatrixGenerics_1.16.0 matrixStats_1.4.1
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 gridExtra_2.3
## [3] rlang_1.1.4 magrittr_2.0.3
## [5] compiler_4.4.1 RSQLite_2.3.7
## [7] DelayedMatrixStats_1.26.0 png_0.1-8
## [9] vctrs_0.6.5 pkgconfig_2.0.3
## [11] crayon_1.5.3 fastmap_1.2.0
## [13] XVector_0.44.0 labeling_0.4.3
## [15] utf8_1.2.4 rmarkdown_2.28
## [17] tzdb_0.4.0 UCSC.utils_1.0.0
## [19] ggbeeswarm_0.7.2 bit_4.5.0
## [21] bluster_1.14.0 xfun_0.48
## [23] zlibbioc_1.50.0 cachem_1.1.0
## [25] beachmat_2.20.0 jsonlite_1.8.9
## [27] blob_1.2.4 highr_0.11
## [29] DelayedArray_0.30.1 BiocParallel_1.38.0
## [31] cluster_2.1.6 irlba_2.3.5.1
## [33] parallel_4.4.1 R6_2.5.1
## [35] bslib_0.8.0 stringi_1.8.4
## [37] RColorBrewer_1.1-3 jquerylib_0.1.4
## [39] Rcpp_1.0.13 igraph_2.0.3
## [41] Matrix_1.7-0 timechange_0.3.0
## [43] tidyselect_1.2.1 rstudioapi_0.17.0
## [45] abind_1.4-8 yaml_2.3.10
## [47] codetools_0.2-20 lattice_0.22-6
## [49] KEGGREST_1.44.1 withr_3.0.1
## [51] evaluate_1.0.1 Biostrings_2.72.1
## [53] pillar_1.9.0 generics_0.1.3
## [55] rprojroot_2.0.4 hms_1.1.3
## [57] sparseMatrixStats_1.16.0 munsell_0.5.1
## [59] scales_1.3.0 glue_1.8.0
## [61] metapod_1.12.0 tools_4.4.1
## [63] BiocNeighbors_1.22.0 ScaledMatrix_1.12.0
## [65] locfit_1.5-9.10 scran_1.32.0
## [67] colorspace_2.1-1 GenomeInfoDbData_1.2.12
## [69] beeswarm_0.4.0 BiocSingular_1.20.0
## [71] vipor_0.4.7 cli_3.6.3
## [73] rsvd_1.0.5 fansi_1.0.6
## [75] S4Arrays_1.4.1 gtable_0.3.5
## [77] sass_0.4.9 digest_0.6.37
## [79] dqrng_0.4.1 SparseArray_1.4.8
## [81] farver_2.1.2 memoise_2.0.1
## [83] htmltools_0.5.8.1 lifecycle_1.0.4
## [85] httr_1.4.7 statmod_1.5.0
## [87] bit64_4.5.2